#REPLICATION OF BLOOM, SCHANKERMAN, & VAN REENEN 2013
#Gov 2001
#Replication authors: Carlos Carpi, Ian Lundberg, and Rohan Thavarajah
#Authors listed alphabetically

#install.packages("foreign")
rm(list=ls(all=TRUE))
library(foreign)
#YOU NEED TO CHANGE THIS LINE TO THE FILE LOCATION ON YOUR COMPUTER
setwd("C:\\Users\\rthavarajah\\Desktop\\Paper Stuff\\Bloom Master\\Spillovers Computation\\7 output - jaffe and mahalanobis combined")
statadata <- read.dta("spillover_final2.dta")

#NOTES:
#I first read in their Stata data.
#Then, I use R code to do the same things they do in the Stata do file.
#Throughout, I name the models t#c# to indicate table # column #
#I define some of the key variables in comments (not all of them) before running models.

#Key methods: Use lm() to estimate the model with the coefficients.
#Apply NeweyWest() to the lm() object to get Newey-West corrected standard errors,
#which account for serial autocorrelation of order 1 and arbitrary heteroskedasticity.
#NeweyWest has no impact on the coefficients, only the standard errors.
#For IV regression, use ivreg (from AER package) and the adjust standard errors with NeweyWest.

#The line below is just an adjustment so we get decimal output rather than exponential notation
	options("scipen" = 10)


#Summary of what variables are (for Table 2)
#tobinq: Tobin's Q
#rmkvaf: Market value
#grd: R&D stock
#grd_k1: R&D stock/fixed capital
#rxrd: R&D flow
#gspilltec: SPILLTECH
#gspillsic: SPILLSIC
#pat_count: Patent flow
#pat_cite: Cite-weighted patents
#rsales: Sales
#rppent: Fixed capital
#emp: Employment (in thousands)

#For table 2, we only want years 1981-2001
	data_1981_2001 <- statadata[which(statadata$year>=1981 & statadata$year<=2001), ]
	table2_vars <- c("tobinq", "rmkvaf", "grd", "grd_k1", "rxrd", "gspilltec", "gspillsic", "pat_count", "pat_cite", "rsales", "rppent", "emp")
	summary(data_1981_2001[table2_vars])
	
	#Make the output into a table
	table2_line <- function(variable) {
		return(cbind(median(variable, na.rm=TRUE),mean(variable, na.rm=TRUE),sqrt(var(variable, na.rm=TRUE))))
	}
	#OUTPUT FOR TABLE 2
	rbind(c("Median","Mean","Standard Deviation"),t(as.matrix(apply(data_1981_2001[table2_vars],2,table2_line))))

#For table 3, we also need the following:
#lq: ln(Tobin's Q)
#lgspilltec1: ln(SPILLTECH)_(t-1), Jaffe distance measure
#lgspillsic1: ln(SPILLSIC)_(t-1), Jaffe distance measure
#lgspillmaltec1, Mahalanobis distance measure
#lgspillmalsic1, Mahalanobis distance measure
#grd_k1 + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5: (R&D Stock / Capital Stock)^n_(t-1)
	#The above is included as a 6th order polynomial, only first term shown
#grd_k1_dum: dummy variable for observations where lagged R&D stock is 0
#lsales_ind: Current industry sales in each firm's output industry
#lsales_ind1: Lagged sales in each firm's output industry
#factor(year): Full set of year dummies
#factor(num): Full set of firm dummies

#Instruments
#lgspilltecIV1: ln(TECHTAX)
#lgspillsicIV1: ln(SICTAX)

#lfirm: Federal tax credit value
#lstate: State tax credit value
#firm_dum: 
library(sandwich)

N_tab3 <- NULL
tab3_col2_coef <- NULL
tab3_col2_se <- NULL
tab3_col5_coef <- NULL
tab3_col5_se <- NULL

for (i in 1984:1997) {
  i <- as.character(i)
  data_table3 <- statadata[which(statadata$year>1984 & statadata$year<2001), ]
  data_table3_nm <- na.omit(data_table3[c("lq",paste("lgspilltec_ciq1",i,sep=""),"lgspillsic_ciq1", "grd_k1", "grd_k1_dum", "grd_kt2", "grd_kt3", "grd_kt4", "grd_kt5", "grd_kt6", "lsales_ind", "lsales_ind1", "year", "num", paste("lgspillmaltec_ciq1",i,sep=""), "lgspillmalsic_ciq1", "lgspilltecIV1", "lgspillsicIV1", "lfirm", "lstate", "firm_dum")])
  N_tab3[i] <- nrow(data_table3_nm)
  
  #Table 3, Column 2: OLS Jaffe, with firm fixed effects
  model <- paste("lq ~ lgspilltec_ciq1",i," + lgspillsic_ciq1 + grd_k1 + grd_k1_dum + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5 + grd_kt6 + lsales_ind + lsales_ind1 + factor(year) + factor(num)",sep="")
  t3c2 <- lm(model, data = data_table3_nm)
  t3c2_se <- sqrt(diag(NeweyWest(t3c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  t3c2_results <- cbind(t3c2$coef,t3c2_se)
  tab3_col2_coef[i] <- t3c2_results[paste("lgspilltec_ciq1",i,sep=""),1]
  tab3_col2_se[i] <- t3c2_results[paste("lgspilltec_ciq1",i,sep=""),2]
  
  #Table 3, Column 5: OLS Mahalanobis, with firm fixed effects
  model <- paste("lq ~ lgspillmaltec_ciq1",i," + lgspillmalsic_ciq1 + grd_k1 + grd_k1_dum + grd_kt2 + grd_kt3 + grd_kt4 + grd_kt5 + grd_kt6 + lsales_ind + lsales_ind1 + factor(year) + factor(num)",sep="")
  t3c5 <- lm(model, data = data_table3_nm)
  t3c5_se <- sqrt(diag(NeweyWest(t3c5, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  t3c5_results <- cbind(t3c5$coef,t3c5_se)
  tab3_col5_coef[i] <- t3c5_results[paste("lgspillmaltec_ciq1",i,sep=""),1]
  tab3_col5_se[i] <- t3c5_results[paste("lgspillmaltec_ciq1",i,sep=""),2]  
}
  

N_tab4 <- NULL
tab4_col3_coef <- NULL
tab4_col3_se <- NULL
tab4_col4_coef <- NULL
tab4_col4_se <- NULL

for (i in 1984:1997) {
  i <- as.character(i)
  data_table4 <- statadata[which(statadata$year<1999 & statadata$year>(statadata$fyear+statadata$prior_years)), ]
  data_table4_nm <- na.omit(data_table4[c("pat_cite", "lpat_cite1", "lpat_cite1_dum", "lgrd1", "lgrd1_dum", paste("lgspilltec_ciq1",i,sep=""), "lgspillsic_ciq1", "lsales1", "lpriorpat_cite", "lpriorpat_cite_dum", "num", "year", paste("lgspillmaltec_ciq1",i,sep=""), "lgspillmalsic_ciq1", "lgspilltecIV1", "lgspillsicIV1", "year", "sic", "lghxrd1")])
  #Check that we have 9,023 observations
  N_tab4 <- nrow(data_table4_nm)
  #In this table, standard errors are clustered rather than corrected with Newey-White
  #All models include time dummies and industry dummies
  #Merge software together as otherwise does not produce standard errors as too many industry codes
  data_table4_nm$sic[data_table4_nm$sic==6211] <- 999
  data_table4_nm$sic[data_table4_nm$sic==7370] <- 7373
  #install.packages("MASS")
  library(MASS)
  #Standard error issue for negative binomial (paper clusters SEs)
  #Some code to estimate cluster-robust standard errors.
  #Taken from http://projects.iq.harvard.edu/files/gov2001/files/section7_2014_print.pdf, slides 77-78
  cl.error <- function (model, clustvar) {
    fc <- clustvar
    m <- length(unique(fc))
    k <- length(coef(model))
    u <- estfun(model)
    u.clust <- matrix(NA, nrow=m, ncol=k)
    for(j in 1:k){
      u.clust[,j] <- tapply(u[,j], fc, sum)
    }
    return(sqrt(diag(vcov(model) %*% ((m / (m-1)) * t(u.clust) %*% (u.clust)) %*% vcov(model))))
  }

  #Table 4, Column 3: Model from column 2, with lagged patents (lpat_cite1)
  model <- paste("pat_cite ~ lgspilltec_ciq1",i," + lgspillsic_ciq1 + lgrd1 + lpat_cite1 + lpriorpat_cite + lpat_cite1_dum + lpriorpat_cite_dum + lgrd1_dum + lsales1 + factor(sic) + factor(year)",sep="")
  t4c3 <- glm.nb(model, data = data_table4_nm)
  coefs <- as.matrix(t4c3$coef)
  #t4c3_se <- sqrt(diag(vcov(t4c3)))
  #t4c3_se_cl <- cl.error(model = t4c3, clustvar = data_table4_nm$num)
  t4c3_se_nw <- sqrt(diag(NeweyWest(t4c3, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  se <- as.matrix(t4c3_se_nw)
  tab4_col3_coef[i] <- coefs[paste("lgspilltec_ciq1",i,sep=""),1]
  tab4_col3_se[i] <- se[paste("lgspilltec_ciq1",i,sep=""),1]  
  
  #Table 4, Column 4: NBin Mahalanobis (otherwise same as column 3)
  model <- paste("pat_cite ~ lgspillmaltec_ciq1",i," + lgspillmalsic_ciq1 + lgrd1 + lpat_cite1 + lpriorpat_cite + lpat_cite1_dum + lpriorpat_cite_dum + lgrd1_dum + lsales1 + factor(sic) + factor(year)",sep="")
  t4c4 <- glm.nb(model, data = data_table4_nm)
  coefs <- as.matrix(t4c4$coef)
  #t4c4_se <- sqrt(diag(vcov(t4c4)))
  #t4c4_se_cl <- cl.error(model = t4c4, clustvar = data_table4_nm$num)
  t4c4_se_nw <- sqrt(diag(NeweyWest(t4c4, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  se <- as.matrix(t4c4_se_nw)
  tab4_col4_coef[i] <- coefs[paste("lgspillmaltec_ciq1",i,sep=""),1]
  tab4_col4_se[i] <- se[paste("lgspillmaltec_ciq1",i,sep=""),1] 
}  
  


N_tab5 <- NULL
tab5_col2_coef <- NULL
tab5_col2_se <- NULL
tab5_col4_coef <- NULL
tab5_col4_se <- NULL

for (i in 1984:1997) {
  i <- as.character(i)
  data_table5 <- statadata[which(statadata$year>1984 & statadata$year<2001), ]
  data_table5_nm <- na.omit(data_table5[c("lsales", "lemp1", "lppent1", "lgrd1", "lgrd1_dum", paste("lgspilltec_ciq1",i,sep=""), "lgspillsic_ciq1", paste("lgspillmaltec_ciq1",i,sep=""), "lgspillmalsic_ciq1", "lgspilltecIV1", "lgspillsicIV1", "lsales_ind", "lsales_ind1", "lpind_ind", "year", "num")])
  N_tab5[i] <- nrow(data_table5_nm)

  #Table 5, Column 2: OLS Jaffe, with firm fixed effects
  model <- paste("lsales ~ lgspilltec_ciq1",i," + lgspillsic_ciq1 + lppent1 + lemp1 + lgrd1 + lgrd1_dum + lsales_ind + lsales_ind1 + lpind_ind + factor(year) + factor(num)",sep="")
  t5c2 <- lm(model, data = data_table5_nm)
  t5c2_se <- sqrt(diag(NeweyWest(t5c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  t5c2_results <- cbind(t5c2$coef,t5c2_se)
  tab5_col2_coef[i] <- t5c2_results[paste("lgspilltec_ciq1",i,sep=""),1]
  tab5_col2_se[i] <- t5c2_results[paste("lgspilltec_ciq1",i,sep=""),2]
  
  #Table 5, Column 4: OLS Mahalanobis
  model <- paste("lsales ~ lgspillmaltec_ciq1",i," + lgspillmalsic_ciq1 + lppent1 + lemp1 + lgrd1 + lgrd1_dum + lsales_ind + lsales_ind1 + lpind_ind + factor(year) + factor(num)",sep="")
  t5c4 <- lm(model, data = data_table5_nm)
  t5c4_se <- sqrt(diag(NeweyWest(t5c4, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  t5c4_results <- cbind(t5c4$coef,t5c4_se)
  tab5_col4_coef[i] <- t5c4_results[paste("lgspillmaltec_ciq1",i,sep=""),1]
  tab5_col4_se[i] <- t5c4_results[paste("lgspillmaltec_ciq1",i,sep=""),2]  
}  
  

N_tab6 <- NULL
tab6_col2_coef <- NULL
tab6_col2_se <- NULL
tab6_col4_coef <- NULL
tab6_col4_se <- NULL

for (i in 1984:1997) {
  i <- as.character(i)
  data_table6 <- statadata[which(statadata$year>1984 & statadata$year<2001), ]
  data_table6_nm <- na.omit(data_table6[c("lxrd_sales", paste("lgspilltec_ciq1",i,sep=""), "lgspillsic_ciq1", paste("lgspillmaltec_ciq1",i,sep=""), "lgspillmalsic_ciq1", "lgspilltecIV1", "lgspillsicIV1", "lsales_ind", "lsales_ind1", "lfirm", "firm_dum", "lstate", "year", "num")])
  N_tab6[i] <- nrow(data_table6_nm)
  
  #Table 6, Column 2: OLS Jaffe, with firm fixed effects
  model <- paste("lxrd_sales ~ lgspilltec_ciq1",i," + lgspillsic_ciq1 + lsales_ind + lsales_ind1 + factor(year) + factor(num)",sep="")
  t6c2 <- lm(model, data = data_table6_nm)
  t6c2_se <- sqrt(diag(NeweyWest(t6c2, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  t6c2_results <- cbind(t6c2$coef,t6c2_se)
  head(t6c2_results)
  tab6_col2_coef[i] <- t6c2_results[paste("lgspilltec_ciq1",i,sep=""),1]
  tab6_col2_se[i] <- t6c2_results[paste("lgspilltec_ciq1",i,sep=""),2]

  #Table 6, Column 4: OLS Mahalanobis, otherwise same as column 2
  model <- paste("lxrd_sales ~ lgspillmaltec_ciq1",i," + lgspillmalsic_ciq1 + lsales_ind + lsales_ind1 + factor(year) + factor(num)",sep="")
  t6c4 <- lm(model, data = data_table6_nm)
  t6c4_se <- sqrt(diag(NeweyWest(t6c4, lag = 1, order.by = NULL, prewhite = FALSE, adjust = TRUE, diagnostics = FALSE, sandwich = TRUE, verbose = TRUE)))
  t6c4_results <- cbind(t6c4$coef,t6c4_se)
  tab6_col4_coef[i] <- t6c4_results[paste("lgspillmaltec_ciq1",i,sep=""),1]
  tab6_col4_se[i] <- t6c4_results[paste("lgspillmaltec_ciq1",i,sep=""),2]
}


tab3_col2_bloom <- 0.381
tab3_col5_bloom <- 0.903
tab4_col3_bloom <- 0.417
tab4_col4_bloom <- 0.530
tab5_col2_bloom <- 0.191
tab5_col4_bloom <- 0.264
tab6_col2_bloom <- 0.100
tab6_col4_bloom <- -0.176


setwd("C:\\Users\\rthavarajah\\Desktop\\Paper Stuff\\Bloom Master\\Spillovers Computation\\9 formatted outputs")

pdf("t3c2.pdf")
plot(tab3_col2_coef, ylim=c(-.5,0.5), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab3_col2_coef+1.96*tab3_col2_se, col="darkgray")
lines(tab3_col2_coef-1.96*tab3_col2_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab3_col2_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("t3c5.pdf")
plot(tab3_col5_coef, ylim=c(-.5,1), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab3_col5_coef+1.96*tab3_col2_se, col="darkgray")
lines(tab3_col5_coef-1.96*tab3_col2_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab3_col5_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("t4c3.pdf")
plot(tab4_col3_coef, ylim=c(-.1,.5), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab4_col3_coef+1.96*tab4_col3_se, col="darkgray")
lines(tab4_col3_coef-1.96*tab4_col3_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab4_col3_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("t4c4.pdf")
plot(tab4_col4_coef, ylim=c(-.2,.6), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab4_col4_coef+1.96*tab4_col3_se, col="darkgray")
lines(tab4_col4_coef-1.96*tab4_col3_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab4_col4_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("t5c2.pdf")
plot(tab5_col2_coef, ylim=c(-.5,.5), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab5_col2_coef+1.96*tab5_col2_se, col="darkgray")
lines(tab5_col2_coef-1.96*tab5_col2_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab5_col2_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("t5c4.pdf")
plot(tab5_col4_coef, ylim=c(-.3,.5), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab5_col4_coef+1.96*tab5_col4_se, col="darkgray")
lines(tab5_col4_coef-1.96*tab5_col4_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab5_col4_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("t6c2.pdf")
plot(tab6_col2_coef, ylim=c(-.5,.4), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab6_col2_coef+1.96*tab6_col2_se, col="darkgray")
lines(tab6_col2_coef-1.96*tab6_col2_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab6_col2_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("t6c4.pdf")
plot(tab6_col4_coef, ylim=c(-1.2,.2), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
lines(tab6_col4_coef+1.96*tab6_col4_se, col="darkgray")
lines(tab6_col4_coef-1.96*tab6_col4_se, col="darkgray")
axis(1, at=1:14, labels=1984:1997)
abline(h = tab6_col4_bloom, untf = FALSE)
legend("bottomright","(x,y)",lty=1,col=c("darkgray","black"),c("95% C.I.","Bloom et al 2013"),bty = "n")
graphics.off()

pdf("N.pdf")
plot(N_tab3, ylim=c(1000,10000), xaxt='n', xlab="last year of 3-year rolling window", ylab="spilltec coefficient")
axis(1, at=1:14, labels=1984:1997)
graphics.off()

shortfall_t3col2 <- mean(tab3_col2_bloom - tab3_col2_coef) / tab3_col2_bloom * 100
shortfall_t3col5 <- mean(tab3_col5_bloom - tab3_col5_coef) / tab3_col5_bloom * 100
shortfall_t4col3 <- mean(tab4_col3_bloom - tab4_col3_coef) / tab4_col3_bloom * 100
shortfall_t4col4 <- mean(tab4_col4_bloom - tab4_col4_coef) / tab4_col4_bloom * 100
shortfall_t5col2 <- mean(tab5_col2_bloom - tab5_col2_coef) / tab5_col2_bloom * 100
shortfall_t5col4 <- mean(tab5_col4_bloom - tab5_col4_coef) / tab5_col4_bloom * 100
shortfall_t6col2 <- mean(tab6_col2_bloom - tab6_col2_coef) / tab6_col2_bloom * 100
shortfall_t6col4 <- mean(tab6_col4_coef - tab6_col4_bloom) / tab6_col4_bloom * 100

shortfall_t3col2
shortfall_t3col5
shortfall_t4col3
shortfall_t4col4
shortfall_t5col2
shortfall_t5col4
shortfall_t6col2
shortfall_t6col4

